Libraries needed: [Note: I wrote results hide but the ## are not results maybe??]
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(Seurat)
## Attaching SeuratObject
library(patchwork)
Loading data & Checking out how the loaded data looks like:
mouse.data <- Read10X(data.dir = "/Users/xiaoyizheng/Downloads/Praktikum/Bioinfo_Einarbeiten/CellChatSelf/mouse/GSE113854_RAW/")
mouse.data
## [[ suppressing 32 column names 'AAACCTGAGAATTCCC-1', 'AAACCTGAGACCGGAT-1', 'AAACCTGAGACGCACA-1' ... ]]
## [[ suppressing 32 column names 'AAACCTGAGAATTCCC-1', 'AAACCTGAGACCGGAT-1', 'AAACCTGAGACGCACA-1' ... ]]
## [[ suppressing 32 column names 'AAACCTGAGAATTCCC-1', 'AAACCTGAGACCGGAT-1', 'AAACCTGAGACGCACA-1' ... ]]
Creating Seurat object:
mouse <- CreateSeuratObject(counts = mouse.data, project = "MoSk", min.cells = 3, min.features = 200)
QC (quality control) Matrix for selection and filtration of cells, using user-defined criteria. In this case it is the mitochondrial genes as criteria. [note: apparently they all have same level of mt content. Maybe pre-processed already? or maybe that is not the proper criteria to choose in this case. Which one could be better instead of it?]
mouse[["percent.mt"]] <- PercentageFeatureSet(mouse, pattern = "^MT-")
#Visualization by violin plot (mt percent no significance):
VlnPlot(mouse, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
## Warning in SingleExIPlot(type = type, data = data[, x, drop = FALSE], idents =
## idents, : All cells have the same value of percent.mt.
Genes and total molecules calculated and stored in obj. meta data.
Showing QC metrics for the first 5 cells:
head(mouse@meta.data, 5)
## orig.ident nCount_RNA nFeature_RNA percent.mt
## AAACCTGAGAATTCCC-1 MoSk 1475 783 0
## AAACCTGAGACCGGAT-1 MoSk 4148 1624 0
## AAACCTGAGACGCACA-1 MoSk 3136 1342 0
## AAACCTGAGATGTGGC-1 MoSk 3138 1323 0
## AAACCTGAGCTGCGAA-1 MoSk 4029 1721 0
#examining a few genes in the first thirty cells
mouse.data[c("Igf2", "Cck", "Pf4"), 1:30]
## 3 x 30 sparse Matrix of class "dgCMatrix"
## [[ suppressing 30 column names 'AAACCTGAGAATTCCC-1', 'AAACCTGAGACCGGAT-1', 'AAACCTGAGACGCACA-1' ... ]]
##
## Igf2 . . . . . 1 . . . . . . . . . . . 1 . . . . . . . . . . . .
## Cck . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## Pf4 . . . . . . . . 11 . . . . . 25 . . 1 1 . 1 1 . 1 . . . . . 2
#a few genes in the first 30 cells
#picked "Igf2", "Cck", "Pf4" according to web
plot1 <- FeatureScatter(mouse, feature1 = "nCount_RNA", feature2 = "percent.mt")
## Warning in cor(x = data[, 1], y = data[, 2]): the standard deviation is zero
#since all have same mt quality...
plot2 <- FeatureScatter(mouse, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
#positively correlated
plot1 + plot2
since it is already normalized => this step omitted
mouse <- FindVariableFeatures(mouse, selection.method = "vst", nfeatures = 2000)
top10 <- head(VariableFeatures(mouse), 10) # Identify the 10 most highly variable genes
top10
## [1] "Hbb-bs" "Hba-a1" "Ccl21a" "Hbb-bt" "Saa3" "Acp5" "Ccl5" "S100a8"
## [9] "Mmp9" "S100a9"
plot1 <- VariableFeaturePlot(mouse)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
## When using repel, set xnudge and ynudge to 0 for optimal results
plot1 + plot2
all.genes <- rownames(mouse)
mouse <- ScaleData(mouse, features = all.genes)
## Centering and scaling data matrix
#optional: mouse <- ScaleData(mouse)
mouse <- RunPCA(mouse, features = VariableFeatures(object = mouse))
## PC_ 1
## Positive: Sparc, Bgn, Col1a2, Col1a1, Col3a1, Col6a1, Lgals1, Serpinh1, Col5a2, Fstl1
## Tmsb10, Col6a2, Col5a1, Col12a1, Aebp1, Dcn, Postn, Meg3, Fn1, Col6a3
## Mmp2, Fbln2, Lum, S100a6, Fosb, Loxl1, Thbs2, Serpinf1, Lrrc15, Tuba1a
## Negative: Tyrobp, Fcer1g, Lyz2, C1qb, Apoe, Ctss, C1qa, C1qc, Ms4a7, Aif1
## Cd52, Pf4, Wfdc17, Laptm5, Fth1, Cd74, Ms4a6c, Lgals3, Clec4n, Cd68
## Lst1, Alox5ap, H2-Aa, Ptpn18, Ucp2, Id2, AF251705, Trem2, Fcgr3, Clec4a2
## PC_ 2
## Positive: Col1a1, Col1a2, Col3a1, Mfap4, Lum, Postn, Col5a2, Col12a1, Dcn, Thbs2
## Igf1, Aebp1, Mdk, Mmp2, Itm2a, Col5a1, Spon1, Lox, Ptn, Olfml3
## Dpt, Cthrc1, Fndc1, Aspn, Serpinf1, Lrrc15, Col6a2, Mfap5, Col6a1, Prss35
## Negative: Cdh5, Col4a1, Col4a2, Pecam1, Egfl7, Igfbp7, Cd93, Crip2, Cav1, Ramp2
## Plvap, Vim, Col15a1, Myct1, Kdr, Adgrf5, Esam, Pdlim1, Emcn, Ctla2a
## Ecscr, Sparcl1, Col18a1, Eng, Adgrl4, Mcam, Itga6, Gimap6, Ehd4, F11r
## PC_ 3
## Positive: Sparcl1, Rgs5, Il6, Tinagl1, Igfbp7, Col4a1, Col4a2, Gm13889, Fabp4, Ebf1
## Ly6c1, Sept4, Emcn, Adgrf5, Ndufa4l2, Pecam1, Esam, Adgrl4, Flt1, Notch3
## Adamts9, Epas1, Des, Bcr, Vtn, Mcam, 2200002D01Rik, Higd1b, Apold1, Cygb
## Negative: Ftl1, Tyrobp, Fth1, Fcer1g, Lyz2, Cyba, Lgals3, Apoe, C1qb, Ctss
## C1qc, C1qa, Cstb, Laptm5, Tmsb4x, Lst1, Ctsd, Pf4, Ms4a7, Cd68
## Lgmn, Gm10116, Clec4n, Aif1, Wfdc17, Csf1r, Id2, Alox5ap, Ms4a6c, Ucp2
## PC_ 4
## Positive: Rgs5, Ndufa4l2, Gm13889, Crip1, Acta2, Higd1b, Des, Notch3, Cox4i2, Thy1
## Mylk, Pdgfa, S100a4, Serpine2, Ppp1r14a, Il6, Myl9, Rasgrp2, Ebf1, Tuba1b
## 2810417H13Rik, Cygb, Stmn1, Mgp, Ube2c, Mustn1, Cks2, Col4a1, Tpm1, H2afz
## Negative: Mfap4, Cdh5, Pecam1, Igf1, Col1a2, Cd34, Cd93, Ptn, Col3a1, Col1a1
## Ramp2, S100a16, Egfl7, Mest, Lum, Myct1, Dpt, Thbs2, Mdk, Itm2a
## Aebp1, Col14a1, Kdr, Ly6c1, Ogn, Ctla2a, C1qtnf3, Adgrl4, Emcn, Plvap
## PC_ 5
## Positive: Birc5, Stmn1, H2afz, Ube2c, Top2a, Hmgb2, Tuba1b, 2810417H13Rik, Cenpa, Cks2
## Cdca3, 2700094K13Rik, Cks1b, Ccnb2, Tubb5, Hmgb1, Cdk1, H2afx, Smc2, Lockd
## Ccnb1, Cdca8, Ccna2, Prc1, Ccdc34, H2afv, Ube2s, Hmgn2, Nusap1, Spc24
## Negative: Rgs5, Cxcl1, Gm13889, Il6, Meg3, Serping1, Errfi1, Serpine2, Mylk, Col4a1
## Notch3, Ndufa4l2, Ccl2, Ebf1, Mt1, Col4a2, Des, Neat1, Phlda1, Cygb
## Dnajb1, Malat1, Apold1, Col18a1, Tnfaip2, Adamts4, Higd1b, Nr4a1, Sparcl1, Bcr
print(mouse[["pca"]], dims = 1:5, nfeatures = 5)
## PC_ 1
## Positive: Sparc, Bgn, Col1a2, Col1a1, Col3a1
## Negative: Tyrobp, Fcer1g, Lyz2, C1qb, Apoe
## PC_ 2
## Positive: Col1a1, Col1a2, Col3a1, Mfap4, Lum
## Negative: Cdh5, Col4a1, Col4a2, Pecam1, Egfl7
## PC_ 3
## Positive: Sparcl1, Rgs5, Il6, Tinagl1, Igfbp7
## Negative: Ftl1, Tyrobp, Fth1, Fcer1g, Lyz2
## PC_ 4
## Positive: Rgs5, Ndufa4l2, Gm13889, Crip1, Acta2
## Negative: Mfap4, Cdh5, Pecam1, Igf1, Col1a2
## PC_ 5
## Positive: Birc5, Stmn1, H2afz, Ube2c, Top2a
## Negative: Rgs5, Cxcl1, Gm13889, Il6, Meg3
VizDimLoadings(mouse, dims = 1:2, reduction = "pca")
DimPlot(mouse, reduction = "pca")
heatmap: exploration of the primary sources of heterogeneity -> decide which PCs to include for further downstream analyses
DimHeatmap(mouse, dims = 1, cells = 500, balanced = TRUE) #heatmap: exploration of the primary sources of heterogeneity
DimHeatmap(mouse, dims = 1:15, cells = 500, balanced = TRUE)
# VII. Determination of Dimension Metafeature combines information
across a correlated feature set. top principal components => robust
compression how many PCs to take? ###JackStraw procedure permute subset
of the data (1% by default) and rerun PCA significant PCs ~ strong
enrichment of low p-value features
mouse <- JackStraw(mouse, num.replicate = 100)
mouse <- ScoreJackStraw(mouse, dims = 1:20)
JackStrawPlot(mouse, dims = 1:15)
## Warning: Removed 21000 rows containing missing values (`geom_point()`).
alternative heuristic method Elbow plot: observe where the elbow is and then see which PCs to choose
ElbowPlot(mouse)
# VIII. Clustering Cells
mouse <- FindNeighbors(mouse, dims = 1:10)
## Computing nearest neighbor graph
## Computing SNN
mouse <- FindClusters(mouse, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 22322
## Number of edges: 704154
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9039
## Number of communities: 15
## Elapsed time: 3 seconds
# Look at cluster IDs of the first 5 cells
head(Idents(mouse), 5)
## AAACCTGAGAATTCCC-1 AAACCTGAGACCGGAT-1 AAACCTGAGACGCACA-1 AAACCTGAGATGTGGC-1
## 3 1 13 1
## AAACCTGAGCTGCGAA-1
## 7
## Levels: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
input to the UMAP and tSNE: same PCs as input to the clustering analysis
mouse <- RunUMAP(mouse, dims = 1:10)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 17:24:37 UMAP embedding parameters a = 0.9922 b = 1.112
## 17:24:37 Read 22322 rows and found 10 numeric columns
## 17:24:37 Using Annoy for neighbor search, n_neighbors = 30
## 17:24:37 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 17:24:39 Writing NN index file to temp file /var/folders/w_/82bj411144v93vqxt1tbsg9h0000gn/T//Rtmp5dFdA6/file9cc3fecef22
## 17:24:39 Searching Annoy index using 1 thread, search_k = 3000
## 17:24:46 Annoy recall = 100%
## 17:24:46 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 17:24:47 Initializing from normalized Laplacian + noise (using irlba)
## 17:24:48 Commencing optimization for 200 epochs, with 905958 positive edges
## 17:25:00 Optimization finished
#note that you can set `label = TRUE` or use the LabelClusters function to help label individual clusters
DimPlot(mouse, reduction = "umap")
#saveRDS(mouse, file = "/Users/xiaoyizheng/Downloads/Bioinfo_Einarbeiten/CellChatSelf/mouse/output/mouseOutput.rds")
# find all markers of cluster 2
cluster2.markers <- FindMarkers(mouse, ident.1 = 2, min.pct = 0.25)
head(cluster2.markers, n = 5)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## Col3a1 0 0.9038735 1.000 0.962 0
## Fmod 0 0.4638764 0.308 0.068 0
## Angptl1 0 0.4226594 0.327 0.075 0
## Dpt 0 0.7186185 0.715 0.330 0
## Mdk 0 0.8305024 0.836 0.405 0
cluster5.markers <- FindMarkers(mouse, ident.1 = 5, ident.2 = c(0, 3), min.pct = 0.25)
head(cluster5.markers, n = 5)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## Col3a1 0 -1.9613086 0.873 0.997 0
## Col5a2 0 -1.1786174 0.381 0.858 0
## Col6a3 0 -1.1001785 0.400 0.882 0
## Selp 0 0.9992272 0.312 0.008 0
## F11r 0 0.5869482 0.373 0.011 0
mouse.markers <- FindAllMarkers(mouse, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25) #No features pass logfc.threshold threshold, maybe threshold=0.1, 0.15, 0.2?
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
## Calculating cluster 9
## Calculating cluster 10
## Calculating cluster 11
## Calculating cluster 12
## Calculating cluster 13
## Calculating cluster 14
mouse.markers <- FindAllMarkers(mouse, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.2) #0.2 is okki
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
## Calculating cluster 9
## Calculating cluster 10
## Calculating cluster 11
## Calculating cluster 12
## Calculating cluster 13
## Calculating cluster 14
mouse.markers %>%
group_by(cluster) %>%
slice_max(n = 2, order_by = avg_log2FC)
## # A tibble: 29 × 7
## # Groups: cluster [15]
## p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene
## <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <chr>
## 1 1.59e-116 0.256 0.764 0.531 2.72e-112 0 Crabp1
## 2 1.26e- 99 1.42 0.338 0.186 2.15e- 95 1 Saa3
## 3 0 1.04 0.909 0.513 0 1 Crabp1
## 4 0 1.58 0.91 0.427 0 2 Ptn
## 5 0 1.41 0.794 0.328 0 2 H19
## 6 0 2.13 0.958 0.234 0 3 Rgs5
## 7 0 1.35 0.422 0.123 0 3 Mgp
## 8 0 1.06 0.739 0.359 0 4 Il1b
## 9 6.01e-232 1.01 0.61 0.284 1.03e-227 4 Cd74
## 10 0 2.52 0.902 0.204 0 5 Ctla2a
## # … with 19 more rows
#cluster0.markers <- FindMarkers(mouse, ident.1 = 0, logfc.threshold = 0.25, test.use = "roc", only.pos = TRUE)
cluster0.markers <- FindMarkers(mouse, ident.1 = 0, logfc.threshold = 0.2, test.use = "roc", only.pos = TRUE)
VlnPlot(mouse, features = c("Rgs5", "Tyrobp"))
VlnPlot(mouse, features = c("Col1a1", "Col1a2"), slot = "counts", log = TRUE) #plot raw counts
FeaturePlot(mouse, features = c("Sparc", "Tyrobp", "Igfbp7", "Pecam1", "Cd34", "Mfap4", "Ndufa4l2", "Il6",
"Birc5"))
mouse.markers %>%
group_by(cluster) %>%
top_n(n = 10, wt = avg_log2FC) -> top10
DoHeatmap(mouse, features = top10$gene) + NoLegend()